Setup

#load libraries
library(ggplot2)
library(pheatmap)
library(DESeq2)
library(ggbiplot)
library(matrixStats)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(pathview)
library(gage)
library(gageData)
library("survival")
library("survminer")

#Import and preprocess data
rna_seq = read.csv("RNAseq.csv", row.names=1) 
clinical_patient = read.delim("data_clinical_patient.txt")
mutations = read.delim("data_mutations.txt")

Data Processing

#Preprocess data
#Filter data where there is only 0 or a read count across all samples
rna_seq <- rna_seq[rowSums(rna_seq)>1,]
RNAseq <- rna_seq

#For some reason, there's some RNA_seq samples not associated with any patient data. we remove these:
RNAseq = RNAseq[,which(substr(colnames(RNAseq),1,12) %in% 
        gsub("-",".",clinical_patient$X.Patient.Identifier[-c(1,2,3,4)]))]

Data Exploration

#Looking at possible factors that may have differences in gene expression
colnames(clinical_patient)
##  [1] "X.Patient.Identifier"                                                                       
##  [2] "Subtype"                                                                                    
##  [3] "TCGA.PanCanAtlas.Cancer.Type.Acronym"                                                       
##  [4] "Other.Patient.ID"                                                                           
##  [5] "Diagnosis.Age"                                                                              
##  [6] "Sex"                                                                                        
##  [7] "Neoplasm.Disease.Stage.American.Joint.Committee.on.Cancer.Code"                             
##  [8] "American.Joint.Committee.on.Cancer.Publication.Version.Type"                                
##  [9] "Last.Communication.Contact.from.Initial.Pathologic.Diagnosis.Date"                          
## [10] "Birth.from.Initial.Pathologic.Diagnosis.Date"                                               
## [11] "Last.Alive.Less.Initial.Pathologic.Diagnosis.Date.Calculated.Day.Value"                     
## [12] "Ethnicity.Category"                                                                         
## [13] "Form.completion.date"                                                                       
## [14] "Neoadjuvant.Therapy.Type.Administered.Prior.To.Resection.Text"                              
## [15] "ICD.10.Classification"                                                                      
## [16] "International.Classification.of.Diseases.for.Oncology..Third.Edition.ICD.O.3.Histology.Code"
## [17] "International.Classification.of.Diseases.for.Oncology..Third.Edition.ICD.O.3.Site.Code"     
## [18] "Informed.consent.verified"                                                                  
## [19] "New.Neoplasm.Event.Post.Initial.Therapy.Indicator"                                          
## [20] "American.Joint.Committee.on.Cancer.Metastasis.Stage.Code"                                   
## [21] "Neoplasm.Disease.Lymph.Node.Stage.American.Joint.Committee.on.Cancer.Code"                  
## [22] "American.Joint.Committee.on.Cancer.Tumor.Stage.Code"                                        
## [23] "Person.Neoplasm.Cancer.Status"                                                              
## [24] "Primary.Lymph.Node.Presentation.Assessment"                                                 
## [25] "Prior.Diagnosis"                                                                            
## [26] "Race.Category"                                                                              
## [27] "Radiation.Therapy"                                                                          
## [28] "Patient.Weight"                                                                             
## [29] "In.PanCan.Pathway.Analysis"                                                                 
## [30] "Overall.Survival.Status"                                                                    
## [31] "Overall.Survival..Months."                                                                  
## [32] "Disease.specific.Survival.status"                                                           
## [33] "Months.of.disease.specific.survival"                                                        
## [34] "Disease.Free.Status"                                                                        
## [35] "Disease.Free..Months."                                                                      
## [36] "Progression.Free.Status"                                                                    
## [37] "Progress.Free.Survival..Months."
#Looking at sex
table(clinical_patient$Sex)
## 
##      1 Female   Male    Sex    SEX STRING 
##      1    121    251      1      1      1
#looking at tumor stage
table(clinical_patient$Neoplasm.Disease.Stage.American.Joint.Committee.on.Cancer.Code)
## 
##                                                                                                                                                     
##                                                                                                                                                  21 
##                                                                                                                                                   1 
##                                                                                                                                                   1 
##                                                                                                                         AJCC_PATHOLOGIC_TUMOR_STAGE 
##                                                                                                                                                   1 
##                                                                                                                                             STAGE I 
##                                                                                                                                                 173 
##                                                                                                                                            STAGE II 
##                                                                                                                                                  87 
##                                                                                                                                           STAGE III 
##                                                                                                                                                   3 
##                                                                                                                                          STAGE IIIA 
##                                                                                                                                                  65 
##                                                                                                                                          STAGE IIIB 
##                                                                                                                                                   9 
##                                                                                                                                          STAGE IIIC 
##                                                                                                                                                   8 
##                                                                                                                                            STAGE IV 
##                                                                                                                                                   3 
##                                                                                                                                           STAGE IVA 
##                                                                                                                                                   1 
##                                                                                                                                           STAGE IVB 
##                                                                                                                                                   2 
##                                                                                                                                              STRING 
##                                                                                                                                                   1 
## The extent of a cancer, especially whether the disease has spread from the original site to other parts of the body based on AJCC staging criteria. 
##                                                                                                                                                   1
#looking at tumor vs tumor free
table(clinical_patient$Person.Neoplasm.Cancer.Status)
## 
##                                                             1 
##                             27                              1 
## Person neoplasm cancer status.  PERSON_NEOPLASM_CANCER_STATUS 
##                              1                              1 
##                         STRING                     Tumor Free 
##                              1                            232 
##                     With Tumor 
##                            113

Analysis on patient features such as sex

countData <- rna_seq

#create conditions
temp = data.frame(sex = clinical_patient$Sex[-c(1,2,3,4)],
                  identity = gsub("-",".",clinical_patient$X.Patient.Identifier[-c(1,2,3,4)]))

category = vector()
knowns = vector()
for(i in colnames(countData)){
  for(j in temp$identity) 
    if(grepl(j,i,fixed = TRUE)){
    category = c(category, temp$sex[temp$identity==j])
    knowns = c(knowns,i)
    break
  }
}
colData = data.frame(sex = category)
rownames(colData)= knowns

countData = countData[,knowns]

#Running the differential expression pipeline
dds = DESeqDataSetFromMatrix(countData=round(countData),
                              colData=colData,
                              design=~sex)
dds = DESeq(dds)

#Building the results table
res <- results(dds)
res = results(dds, contrast=c("sex", "Male", "Female"))
mcols(res, use.names = TRUE)
## DataFrame with 6 rows and 2 columns
##                        type            description
##                 <character>            <character>
## baseMean       intermediate mean of normalized c..
## log2FoldChange      results log2 fold change (ML..
## lfcSE               results standard error: sex ..
## stat                results Wald statistic: sex ..
## pvalue              results Wald test p-value: s..
## padj                results   BH adjusted p-values
summary(res)
## 
## out of 55071 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 3475, 6.3%
## LFC < 0 (down)     : 4911, 8.9%
## outliers [1]       : 0, 0%
## low counts [2]     : 20292, 37%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Variance stabilizing transformation
vsd <- vst(dds)

#PCA
plotPCA(vsd,intgroup=c("sex"))

#Filter by padj value and then log2FoldChange to find significant differentially expressed genes
resSig <- subset(res, padj < 0.05)
resSig_1 <- subset(resSig, log2FoldChange < -1 | log2FoldChange > 1)

genes <- order(resSig_1$padj,decreasing = TRUE)

#patient-gene heatmap
annot_col = data.frame(colData$sex)
row.names(annot_col) <- rownames(colData)

sampleMatrix <- assay(vsd)[genes,]

rownames(sampleMatrix) = rownames(countData[genes,])
colnames(sampleMatrix) = colnames(countData)

pheatmap(sampleMatrix , cluster_rows=FALSE, show_rownames=FALSE,
         show_colnames=FALSE,clustering_method = "ward.D",
         cluster_cols=TRUE, annotation_col=annot_col)

Analysis on patient features such as tumor stages

countData <- rna_seq

#create conditions
temp = data.frame(stage = clinical_patient$Neoplasm.Disease.Stage.American.Joint.Committee.on.Cancer.Code[-c(1,2,3,4)],
                  identity = gsub("-",".",clinical_patient$X.Patient.Identifier[-c(1,2,3,4)]))

category = vector()
knowns = vector()
for(i in colnames(countData)){
  for(j in temp$identity) 
    if(grepl(j,i,fixed = TRUE)){
    category = c(category, temp$stage[temp$identity==j])
    knowns = c(knowns,i)
    break
  }
}
colData = data.frame(stage = category)

#Grouping Data
colData[colData == "STAGE IIIA"] <- "STAGE III"
colData[colData == "STAGE IIIB"] <- "STAGE III"
colData[colData == "STAGE IIIC"] <- "STAGE III"
colData[colData == "STAGE IVA"] <- "STAGE IV"
colData[colData == "STAGE IVB"] <- "STAGE IV"

rownames(colData)= knowns

countData = countData[,knowns]

#Running the differential expression pipeline
dds = DESeqDataSetFromMatrix(countData=round(countData),
                              colData=colData,
                              design=~stage)
dds = DESeq(dds)

#Building the results table
res <- results(dds)
mcols(res, use.names = TRUE)
## DataFrame with 6 rows and 2 columns
##                        type            description
##                 <character>            <character>
## baseMean       intermediate mean of normalized c..
## log2FoldChange      results log2 fold change (ML..
## lfcSE               results standard error: stag..
## stat                results Wald statistic: stag..
## pvalue              results Wald test p-value: s..
## padj                results   BH adjusted p-values
summary(res)
## 
## out of 55071 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 41, 0.074%
## LFC < 0 (down)     : 20, 0.036%
## outliers [1]       : 0, 0%
## low counts [2]     : 28834, 52%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#Variance stabilizing transformation
vsd <- vst(dds)

#PCA
plotPCA(vsd,intgroup=c("stage"))

#Filter by padj value and then log2FoldChange to find significant differentially expressed genes
resSig <- subset(res, padj < 0.05)
resSig_1 <- subset(resSig, log2FoldChange < -1 | log2FoldChange > 1)

genes <- order(resSig_1$padj,decreasing = TRUE)

#patient-gene heatmap
annot_col = data.frame(colData$stage)
row.names(annot_col) <- rownames(colData)

sampleMatrix <- assay(vsd)[genes,]

rownames(sampleMatrix) = rownames(countData[genes,])
colnames(sampleMatrix) = colnames(countData)

pheatmap(sampleMatrix , cluster_rows=FALSE, show_rownames=FALSE,
         show_colnames=FALSE,clustering_method = "ward.D",
         cluster_cols=TRUE, annotation_col=annot_col)

Expression data analysis

#get variance of each expression:
Expression_variance = data.frame(Expressions = rownames(RNAseq))
Expression_variance$variance = rowSds(as.matrix(RNAseq))
Expression_variance = Expression_variance[order(Expression_variance$variance, decreasing=TRUE),]

#Take the top 500 expressions in terms of variance, then do PCA analysis:
Top_500 = Expression_variance$Expressions[1:500]
Culled_RNA = scale(t(RNAseq[Top_500,]))
C.RNA.pca = prcomp(Culled_RNA, center = TRUE, scale. = TRUE)

#Choose the PC's that add up to 90% of the variance:
totvar = 0
PCcount = 0 #This is the number of PCs to use in clustering
while (totvar < 0.9){
  PCcount = PCcount + 1
  totvar = totvar + summary(C.RNA.pca)$importance[2,PCcount]
}
#Create clusters based on that PC's that show 90% of variance
distmat = dist(C.RNA.pca$x[,1:PCcount], method = 'euclidean')
hclust_ward = hclust(distmat, method = 'ward.D')
plot(hclust_ward, labels = FALSE)
rect.hclust(hclust_ward, k = 4, border = 2:6)

fourclust = cutree(hclust_ward, k = 4)
colData1 = data.frame(cluster = fourclust)
colData1$cluster <- sapply(colData1$cluster, as.character)

dds = DESeqDataSetFromMatrix(countData = RNAseq,
                             colData=colData1,
                             design=~cluster)

dds = DESeq(dds)
res = results(dds)
#Get only the significant and differentially expressed genes
res_sig = subset(res, padj < 0.05)
res_sig = subset(res_sig, log2FoldChange < -1 | log2FoldChange > 1)
res_sig = res_sig[order(res_sig$log2FoldChange, decreasing = TRUE), ]
#Variance stabilizing transformation
RNAseq2.0 = RNAseq[rownames(res_sig),]
vsd1 <- vst(dds)

#Patient-gene heatmap
annot_col = data.frame(colData1$cluster)
row.names(annot_col) <- rownames(colData1)

sampleMatrix <- assay(vsd1)[rownames(res_sig),]

rownames(sampleMatrix) = rownames(RNAseq2.0)
colnames(sampleMatrix) = colnames(RNAseq2.0)

pheatmap(sampleMatrix , cluster_rows=FALSE, show_rownames=TRUE, show_colnames = FALSE,
         cluster_cols=TRUE, clustering_method = "ward.D", annotation_col=annot_col)

#PCA plot
pcaData <- plotPCA(vsd1, intgroup=c("cluster"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=cluster)) + 
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed()

#Pathway analysis on the patient cluster DE genes
tmp=gsub("\\..*","",row.names(res))
res$entrez = mapIds(org.Hs.eg.db,
                    keys=tmp, 
                    column="ENTREZID",
                    keytype="ENSEMBL",
                    multiVals="first")
data(kegg.sets.hs)
data(sigmet.idx.hs)

# Focus on signaling and metabolic pathways only
kegg.sets.hs = kegg.sets.hs[sigmet.idx.hs]

#Look at all significant genes/pathways - padj < 0.06
#We were getting trouble with GAGE when trying to get pathways with padj < 0.05 so we used 0.06 instead
res = subset(res, padj < 0.06)
foldchanges = res$log2FoldChange
names(foldchanges) = res$entrez
foldchanges <- foldchanges[!is.na(names(foldchanges))]
head(foldchanges)
##      64102       2268       2519       2729       4800       6405 
## -1.2030675  0.6069746  0.2269985 -0.4761025  0.2182152 -0.2633002
keggres = gage(foldchanges, gsets=kegg.sets.hs)

## Focus on top upregulated and downregulated pathways
keggrespathwaysup <- rownames(keggres$greater)[1] #change to [1:5] to look at top 5 pathways
keggrespathwaysdown <- rownames(keggres$less)[1]

# Extract the 8 character long IDs part of each string
keggresidsup = substr(keggrespathwaysup, start=1, stop=8)
keggresidsdown = substr(keggrespathwaysdown, start=1, stop=8)
pathview(gene.data=foldchanges, pathway.id=keggresidsup, species="hsa")
pathview(gene.data=foldchanges, pathway.id=keggresidsdown, species="hsa")
#Survival analysis on the patient clusters
clin_df = clinical_patient[, 
                           c("X.Patient.Identifier",
                             "Overall.Survival.Status",
                             "Overall.Survival..Months.")]
clin_df = clin_df[-c(1,2,3,4),]
Overall_survival = c()
survival_months = c()

#Possible affected groups
age = c()
race = c()
sex = c()
stage = c()
for (i in rownames(colData1)){
  index = which(gsub("-",".",clin_df$X.Patient.Identifier) == substr(i,1,12))
  Overall_survival = c(Overall_survival, clin_df$Overall.Survival.Status[index])
  survival_months = c(survival_months, clin_df$Overall.Survival..Months.[index])
  sex = c(sex, clinical_patient$Sex[index+4])
  race = c(race, clinical_patient$Race.Category[index+4])
  age = c(age, clinical_patient$Diagnosis.Age[index+4])
  stage = c(stage, clinical_patient$Neoplasm.Disease.Stage.American.Joint.Committee.on.Cancer.Code[index+4])
}

colData1$Overall.Survival.Status = Overall_survival
colData1$Overall.Survival..Months. = survival_months
colData1$sex = sex
colData1$age = age
colData1$race = race
colData1$stage = stage

colData1$deceased = colData1$Overall.Survival.Status == "1:DECEASED"
colData1$Overall.Survival..Months. <- as.numeric(colData1$Overall.Survival..Months.)
fit = survfit(Surv(Overall.Survival..Months., deceased) ~ cluster, data=colData1)

print(fit)
## Call: survfit(formula = Surv(Overall.Survival..Months., deceased) ~ 
##     cluster, data = colData1)
## 
##    1 observation deleted due to missingness 
##             n events median 0.95LCL 0.95UCL
## cluster=1 175     56   70.1    45.1      NA
## cluster=2  99     48   30.6    19.8    60.9
## cluster=3  52     35   37.3    25.3    55.7
## cluster=4  92     24   83.2    55.4      NA
ggsurvplot(fit, data=colData1, pval=T, risk.table=T, risk.table.height=0.35)

We wanted to look at potential clinical factors that affected cluster 3

table(colData1$sex[which(colData1$cluster==3)])
## 
## Female   Male 
##     23     29
table(colData1$age[which(colData1$cluster==3)])
## 
## 20 23 29 37 40 42 43 45 46 50 51 52 53 54 55 57 61 62 64 65 66 67 68 69 70 71 
##  1  2  1  1  1  1  1  1  1  1  1  2  1  1  1  1  2  2  1  1  2  2  1  2  2  1 
## 72 73 74 75 76 77 78 81 
##  3  1  1  4  5  1  1  2
table(colData1$race[which(colData1$cluster==3)])
## 
##                                               Asian Black or African American 
##                         3                         6                         6 
##                     White 
##                        37
table(colData1$stage[which(colData1$cluster==3)])
## 
##               STAGE I   STAGE II  STAGE III STAGE IIIA STAGE IIIC   STAGE IV 
##          9         18         12          3          8          1          1
#None seems promising so we wanted to look at mutation data

Creating confusion matrix based of presence of top 5 mutated and expressed genes in cluster 3

mutationgroup = rownames(colData1)[which(colData1$cluster==3)]
mutations$cluster = ifelse(substr(gsub("-",".",mutations$Tumor_Sample_Barcode),1,12) 
                           %in% substr(mutationgroup,1,12), "3","0")

test = table(mutations$Hugo_Symbol[which(mutations$cluster == "3")])
mostfrequent = order(test, decreasing = TRUE)[1:10]

#ANKHD1 is one of the top 50 mutated gene and also one of the top 5 expressed gene in cluster 3
hasmutation = substr(gsub("-",".",mutations$Tumor_Sample_Barcode[which(mutations$Hugo_Symbol == "ANKHD1")]),1,12)

prediction = ifelse(colData1$cluster == 3,TRUE,FALSE)
real = ifelse(substr(rownames(colData1),1,12) %in% hasmutation, TRUE, FALSE)

table(prediction, real)
##           real
## prediction FALSE TRUE
##      FALSE   355   12
##      TRUE     47    5
#ANKHD1, a top mutated gene, has an accuracy of ~85% which shows that it is prevalent within cluster 3

Survival analysis on patient with mutation in ANKHD1 gene

#Only looking at patient with mutation in specific gene - ANKHD1 - found in cluster 3
groupA <- substr(mutations$Tumor_Sample_Barcode[grep("ANKHD1",mutations$Hugo_Symbol)],1,12)

clinical_patient$group <- ifelse(clinical_patient$X.Patient.Identifier %in% groupA, 
                       "groupA", 
                       ifelse(!(clinical_patient$X.Patient.Identifier %in% groupA), "groupB", NA))


# we are only interested in the "With Tumor" cases for survival
clin_df = clinical_patient[clinical_patient$Person.Neoplasm.Cancer.Status == "With Tumor", 
                    c("X.Patient.Identifier",
                      "Overall.Survival.Status",
                      "Overall.Survival..Months.",
                      "group")]

# create a new boolean variable that has TRUE for dead patients
# and FALSE for live patients
clin_df$deceased = clin_df$Overall.Survival.Status == "1:DECEASED"

# Overall survival months takes into account months_to_death 
#for dead patients, and to months_to_last_follow_up for patients who
# are still alive
clin_df$Overall.Survival..Months. <- as.numeric(clin_df$Overall.Survival..Months.)


# fit a survival model
fit = survfit(Surv(Overall.Survival..Months., deceased) ~ group, data=clin_df) 

print(fit)
## Call: survfit(formula = Surv(Overall.Survival..Months., deceased) ~ 
##     group, data = clin_df)
## 
##    1 observation deleted due to missingness 
##                n events median 0.95LCL 0.95UCL
## group=groupA   3      2   18.3    12.0      NA
## group=groupB 109     59   41.8    29.6    58.9
# we produce a Kaplan Meier plot
ggsurvplot(fit, data=clin_df, pval=T, risk.table=T, risk.table.height=0.35)

Mutation Data Analysis - Looking more closely at mutation data

#Gene patient matrix
rownames = unique(mutations$Hugo_Symbol)
colnames = clinical_patient$X.Patient.Identifier[-c(1,2,3,4)]

patient_gene = matrix(0, nrow = length(rownames), ncol = length(colnames))
rownames(patient_gene) = rownames
colnames(patient_gene) = colnames

for(i in colnames){
  a = mutations$Hugo_Symbol[which(grepl(i,mutations$Tumor_Sample_Barcode))]
  for (j in a){patient_gene[j,i] = 1}
}
#Top 20 mutated gene and their number of occurrences
patient_gene <- cbind(patient_gene, count = rowSums(patient_gene))
patient_gene = patient_gene[order(patient_gene[,373],decreasing=TRUE),]
patient_gene[1:20,373]
##     TTN    TP53  CTNNB1   MUC16   OBSCN     ALB    PCLO   CSMD3    RYR2     FLG 
##     129     112      95      80      54      49      49      48      46      43 
##    APOB  ABCA13   LRP1B   XIRP2   USH2A   HMCN1   GPR98 CACNA1E    FAT3    FUT9 
##      40      38      37      36      36      35      35      34      34      33
#Modify rna_seq so it only includes patients with mutation in one of the top 50 mutated gene
PatientID <- c()
genes <-rownames(patient_gene)

for (i in 1:50){
  tmp <- mutations$Tumor_Sample_Barcode[which(mutations$Hugo_Symbol == genes[i])]
  tmp <- gsub("\\-", ".", tmp)
  PatientID <- append(PatientID,tmp)
}
PatientID <- unique(PatientID)

rna_seq_mod <- rna_seq
rna_seq_mod <- rna_seq_mod[ , (substr(names(rna_seq_mod),1,15) %in% PatientID)]
#DEseq on patient groups based on w/t vs w/out mutation in particular genes
countData <- rna_seq

patient <- colnames(countData)
colData <- data.frame(patient) 

groupA <- colnames(rna_seq_mod) #patient with mutated gene (top 50)

colData$group <- ifelse(colData$patient %in% groupA, 
                       "groupA", 
                       ifelse(!(colData$patient %in% groupA), "groupB", NA))

rownames(colData) <- colData$patient

#Running the differential expression pipeline
dds = DESeqDataSetFromMatrix(countData=round(countData ),
                              colData=colData,
                              design=~group)
dds = DESeq(dds)

#Building the results table
res <- results(dds)
res = results(dds, contrast=c("group", "groupA", "groupB"))
mcols(res, use.names = TRUE)
## DataFrame with 6 rows and 2 columns
##                        type            description
##                 <character>            <character>
## baseMean       intermediate mean of normalized c..
## log2FoldChange      results log2 fold change (ML..
## lfcSE               results standard error: grou..
## stat                results Wald statistic: grou..
## pvalue              results Wald test p-value: g..
## padj                results   BH adjusted p-values
summary(res)
## 
## out of 55073 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 13038, 24%
## LFC < 0 (down)     : 5646, 10%
## outliers [1]       : 0, 0%
## low counts [2]     : 19223, 35%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#Variance stabilizing transformation
vsd <- vst(dds)

#PCA
plotPCA(vsd,intgroup=c("group"))

#Significant and differentially expressed genes
resSig <- subset(res, padj < 0.05)
resSig_1 <- subset(resSig, log2FoldChange < -1 | log2FoldChange > 1)

genes <- order(resSig_1$padj,decreasing = TRUE)

#Patient-gene heatmap
annot_col = data.frame(colData$group)
row.names(annot_col) <- rownames(colData)

sampleMatrix <- assay(vsd)[genes,]

rownames(sampleMatrix) = rownames(countData[genes,])
colnames(sampleMatrix) = colnames(countData)

pheatmap(sampleMatrix , cluster_rows=TRUE, show_rownames=FALSE,
         show_colnames = FALSE, clustering_method = "ward.D",
         cluster_cols=TRUE, annotation_col=annot_col)

#Find the gene names of the top significant and differentially expressed genes
library("AnnotationDbi")
library("org.Hs.eg.db")

ensembl_id = as.vector(rownames(sampleMatrix))
ensembl_id <- substr(ensembl_id, start=1, stop=15)

gene_name <- select(org.Hs.eg.db, keys=ensembl_id, 
                columns="SYMBOL", keytype="ENSEMBL")
head(gene_name)
#Pathway analysis on the patient cluster DE genes
tmp=gsub("\\..*","",row.names(res))
res$entrez = mapIds(org.Hs.eg.db,
                    keys=tmp, 
                    column="ENTREZID",
                    keytype="ENSEMBL",
                    multiVals="first")
data(kegg.sets.hs)
data(sigmet.idx.hs)

# Focus on signaling and metabolic pathways only
kegg.sets.hs = kegg.sets.hs[sigmet.idx.hs]

#Look at all significant genes/pathways - padj < 0.06
#We were getting trouble with GAGE when trying to get pathways with padj < 0.05 so we used 0.06 instead
res = subset(res, padj < 0.06)
foldchanges = res$log2FoldChange
names(foldchanges) = res$entrez
foldchanges <- foldchanges[!is.na(names(foldchanges))]
head(foldchanges)
##      55732       2268       3075       2519       2729       4800 
##  0.4288297 -0.6497088 -0.3444860  0.1983943 -0.2476456  0.3118879
keggres = gage(foldchanges, gsets=kegg.sets.hs)

## Focus on top upregulated and downregulated pathways
keggrespathwaysup <- rownames(keggres$greater)[1] #change to [1:5] to look at top 5 pathways
keggrespathwaysdown <- rownames(keggres$less)[1]

# Extract the 8 character long IDs part of each string
keggresidsup = substr(keggrespathwaysup, start=1, stop=8)
keggresidsdown = substr(keggrespathwaysdown, start=1, stop=8)
pathview(gene.data=foldchanges, pathway.id=keggresidsup, species="hsa")
pathview(gene.data=foldchanges, pathway.id=keggresidsdown, species="hsa")

Comparing expression with mutation analysis data
Notice that patient cluster 3 from expression data analysis have some overlaps with the non-mutated patient cluster (group B) from mutation data analysis

#PCA plot for expression analysis
plotPCA(vsd1,intgroup=c("cluster"))

#PCA plot for mutation analysis where group B is the non-mutated patient cluster
plotPCA(vsd,intgroup=c("group"))

Chekc to see if survival rates of group B is similar to cluster 3

#Survival Analysis - Patients with mutation in one of the top 50 mutated gene vs without
patient <- substr(colData$patient,1,12)
patient <- gsub("\\.", "-", patient)

groupA <- patient[grep("groupA",colData$group)] #patient with mutated gene (top 50)
groupB <- patient[grep("groupB",colData$group)] #patient without mutated gene

clinical_patient$group <- ifelse(clinical_patient$X.Patient.Identifier %in% groupA, 
                       "groupA", 
                       ifelse(clinical_patient$X.Patient.Identifier %in% groupB, "groupB", NA))


# we are only interested in the "With Tumor" cases for survival
clin_df = clinical_patient[clinical_patient$Person.Neoplasm.Cancer.Status == "With Tumor", 
                    c("X.Patient.Identifier",
                      "Overall.Survival.Status",
                      "Overall.Survival..Months.",
                      "group")]

# create a new boolean variable that has TRUE for dead patients
# and FALSE for live patients
clin_df$deceased = clin_df$Overall.Survival.Status == "1:DECEASED"

# Overall survival months takes into account months_to_death 
#for dead patients, and to months_to_last_follow_up for patients who
# are still alive
clin_df$Overall.Survival..Months. <- as.numeric(clin_df$Overall.Survival..Months.)

# fit a survival model
fit = survfit(Surv(Overall.Survival..Months., deceased) ~ group, data=clin_df)

print(fit) 
## Call: survfit(formula = Surv(Overall.Survival..Months., deceased) ~ 
##     group, data = clin_df)
## 
##    4 observations deleted due to missingness 
##               n events median 0.95LCL 0.95UCL
## group=groupA 99     56   37.3    26.4    55.7
## group=groupB 10      4   69.6    49.0      NA
# we produce a Kaplan Meier plot
ggsurvplot(fit, data=clin_df, pval=T, risk.table=T, risk.table.height=0.35)

It seems that cluster 3 and Group B may be in some way related, as their clustering is very similar. However, this does not mean that cluster 3 is group B as shown by the discrepancy in their survival analysis results.